Commit 2d0df08e authored by Flavio Paiva Junqueira's avatar Flavio Paiva Junqueira
Browse files

ZOOKEEPER-1732. ZooKeeper server unable to join established ensemble (German Blanco via fpj)



git-svn-id: https://svn.apache.org/repos/asf/zookeeper/branches/branch-3.4@1534388 13f79535-47bb-0310-9956-ffa450edef68
parent 0130ef91
......@@ -137,6 +137,8 @@ BUGFIXES:
ZOOKEEPER-1558. Leader should not snapshot uncommitted state (fpj)
ZOOKEEPER-1732. ZooKeeper server unable to join established ensemble (German Blanco via fpj)
IMPROVEMENTS:
ZOOKEEPER-1564. Allow JUnit test build with IBM Java
......
......@@ -66,6 +66,14 @@ public class FastLeaderElection implements Election {
*/
final static int maxNotificationInterval = 60000;
/**
* This value is passed to the methods that check the quorum
* majority of an established ensemble for those values that
* should not be taken into account in the comparison
* (electionEpoch and zxid).
*/
final static int IGNOREVALUE = -1;
/**
* Connection manager. Fast leader election uses TCP for
......@@ -324,7 +332,7 @@ public class FastLeaderElection implements Election {
ToSend.mType.notification,
current.getId(),
current.getZxid(),
logicalclock,
current.getElectionEpoch(),
self.getPeerState(),
response.sid,
current.getPeerEpoch());
......@@ -867,15 +875,25 @@ public class FastLeaderElection implements Election {
}
}
/**
/*
* Before joining an established ensemble, verify that
* a majority are following the same leader.
* Only peer epoch is used to check that the votes come
* from the same ensemble. This is because there is at
* least one corner case in which the ensemble can be
* created with inconsistent zxid and election epoch
* info. However, given that only one ensemble can be
* running at a single point in time and that each
* epoch is used only once, using only the epoch to
* compare the votes is sufficient.
*
* @see https://issues.apache.org/jira/browse/ZOOKEEPER-1732
*/
outofelection.put(n.sid, new Vote(n.leader, n.zxid,
n.electionEpoch, n.peerEpoch, n.state));
outofelection.put(n.sid, new Vote(n.leader,
IGNOREVALUE, IGNOREVALUE, n.peerEpoch, n.state));
if (termPredicate(outofelection, new Vote(n.leader,
n.zxid, n.electionEpoch, n.peerEpoch, n.state))
&& checkLeader(outofelection, n.leader, n.electionEpoch)) {
IGNOREVALUE, IGNOREVALUE, n.peerEpoch, n.state))
&& checkLeader(outofelection, n.leader, IGNOREVALUE)) {
synchronized(this){
logicalclock = n.electionEpoch;
self.setPeerState((n.leader == self.getId()) ?
......
......@@ -945,6 +945,15 @@ public class Leader {
+ " ]; starting up and setting last processed zxid: 0x{}",
Long.toHexString(zk.getZxid()));
zk.startup();
/*
* Update the election vote here to ensure that all members of the
* ensemble report the same vote to new servers that start up and
* send leader election notifications to the ensemble.
*
* @see https://issues.apache.org/jira/browse/ZOOKEEPER-1732
*/
self.updateElectionVote(getEpoch());
zk.getZKDatabase().setlastProcessedZxid(zk.getZxid());
}
......
......@@ -435,6 +435,15 @@ public class Learner {
writePacket(ack, true);
sock.setSoTimeout(self.tickTime * self.syncLimit);
zk.startup();
/*
* Update the election vote here to ensure that all members of the
* ensemble report the same vote to new servers that start up and
* send leader election notifications to the ensemble.
*
* @see https://issues.apache.org/jira/browse/ZOOKEEPER-1732
*/
self.updateElectionVote(newEpoch);
// We need to log the stuff that came in between the snapshot and the uptodate
if (zk instanceof FollowerZooKeeperServer) {
FollowerZooKeeperServer fzk = (FollowerZooKeeperServer)zk;
......
......@@ -1194,4 +1194,21 @@ public class QuorumPeer extends Thread implements QuorumStats.Provider {
acceptedEpoch = e;
writeLongToFile(ACCEPTED_EPOCH_FILENAME, e);
}
/**
* Updates leader election info to avoid inconsistencies when
* a new server tries to join the ensemble.
* See ZOOKEEPER-1732 for more info.
*/
protected void updateElectionVote(long newEpoch) {
Vote currentVote = getCurrentVote();
if (currentVote != null) {
setCurrentVote(new Vote(currentVote.getId(),
currentVote.getZxid(),
currentVote.getElectionEpoch(),
newEpoch,
currentVote.getState()));
}
}
}
......@@ -403,4 +403,67 @@ public class FLETest extends ZKTestCase {
}
}
}
/*
* For ZOOKEEPER-1732 verify that it is possible to join an ensemble with
* inconsistent election round information.
*/
@Test
public void testJoinInconsistentEnsemble() throws Exception {
int sid;
QuorumPeer peer;
int waitTime = 10 * 1000;
ArrayList<QuorumPeer> peerList = new ArrayList<QuorumPeer>();
for(sid = 0; sid < 3; sid++) {
peers.put(Long.valueOf(sid),
new QuorumServer(sid,
new InetSocketAddress(PortAssignment.unique()),
new InetSocketAddress(PortAssignment.unique())));
tmpdir[sid] = ClientBase.createTmpDir();
port[sid] = PortAssignment.unique();
}
// start 2 peers and verify if they form the cluster
for (sid = 0; sid < 2; sid++) {
peer = new QuorumPeer(peers, tmpdir[sid], tmpdir[sid],
port[sid], 3, sid, 2000, 2, 2);
LOG.info("Starting peer " + peer.getId());
peer.start();
peerList.add(sid, peer);
}
peer = peerList.get(0);
VerifyState v1 = new VerifyState(peerList.get(0));
v1.start();
v1.join(waitTime);
Assert.assertFalse("Unable to form cluster in " +
waitTime + " ms",
!v1.isSuccess());
// Change the election round for one of the members of the ensemble
long leaderSid = peer.getCurrentVote().getId();
long zxid = peer.getCurrentVote().getZxid();
long electionEpoch = peer.getCurrentVote().getElectionEpoch();
ServerState state = peer.getCurrentVote().getState();
long peerEpoch = peer.getCurrentVote().getPeerEpoch();
Vote newVote = new Vote(leaderSid, zxid+100, electionEpoch+100, peerEpoch, state);
peer.setCurrentVote(newVote);
// Start 3rd peer and check if it joins the quorum
peer = new QuorumPeer(peers, tmpdir[2], tmpdir[2],
port[2], 3, 2, 2000, 2, 2);
LOG.info("Starting peer " + peer.getId());
peer.start();
peerList.add(sid, peer);
v1 = new VerifyState(peer);
v1.start();
v1.join(waitTime);
if (v1.isAlive()) {
Assert.fail("Peer " + peer.getId() + " failed to join the cluster " +
"within " + waitTime + " ms");
}
// cleanup
for (int id = 0; id < 3; id++) {
peer = peerList.get(id);
if (peer != null) {
peer.shutdown();
}
}
}
}
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment